In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

%matplotlib inline
In [2]:
data = pd.read_csv("diabetes2.csv")
In [3]:
data.head()
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [4]:
data_temp = data
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [6]:
data.shape
Out[6]:
(768, 9)
In [7]:
data['Outcome'].value_counts()
Out[7]:
0    500
1    268
Name: Outcome, dtype: int64
In [8]:
data.isnull().sum()
Out[8]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [9]:
sns.catplot(x="Outcome",kind="count",data=data)
plt.show()
In [10]:
# The visulization count of Age of their Diabetics
ax = sns.catplot(x="Age",kind="count",hue="Outcome",data=data,palette="pastel",legend=False)
ax.fig.set_figwidth(20)
ax.fig.set_figheight(10)
plt.legend(loc="upper right",labels=["Non diabetic","Diabetic"])
plt.show()
In [11]:
# Age Distribution by outcome 0

fig = px.histogram(data,x=data[data["Outcome"]==0].Age,marginal="box",color_discrete_sequence=['lightgreen'])
fig.show()
In [12]:
# Age distribution by Outcome 0
fig = px.histogram(data, x=data[data.Outcome==1].Age,
                   marginal="box",
                   color_discrete_sequence=['red'])
fig.show()
In [13]:
data[data['Outcome']==1].Glucose.mean()
Out[13]:
141.25746268656715
In [14]:
x = data.drop(['Outcome'],axis=1)
y = data.loc[:,"Outcome"].values
In [15]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size = 0.33, random_state= 123)
In [16]:
print("xtrain",xtrain.shape)
print("ytrain",ytrain.shape)
print("xtest",xtest.shape)
print("ytest",ytest.shape)
xtrain (514, 8)
ytrain (514,)
xtest (254, 8)
ytest (254,)
In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', max_iter = 1000)
model.fit(xtrain,ytrain)
Out[17]:
LogisticRegression(max_iter=1000, solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=1000, solver='liblinear')
In [18]:
xpred = model.predict(xtrain)
In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytrain,xpred)
Out[19]:
array([[312,  31],
       [ 90,  81]], dtype=int64)
In [20]:
#train score
from sklearn.metrics import accuracy_score
score = accuracy_score(ytrain, xpred)
score
Out[20]:
0.7645914396887159
In [21]:
ypred = model.predict(xtest)
In [22]:
confusion_matrix(ypred, ytest)
Out[22]:
array([[143,  38],
       [ 14,  59]], dtype=int64)
In [23]:
#test score
accuracy_score(ypred, ytest)
Out[23]:
0.7952755905511811
In [24]:
cm1 = confusion_matrix(ytest, ypred)
sns.heatmap(cm1, annot=True, fmt=".0f")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.title("Accuracy Score:{0}".format(score),size=15)
plt.show()
In [25]:
from sklearn.metrics import classification_report,accuracy_score,f1_score,precision_score,recall_score,roc_curve,roc_auc_score
print(classification_report(ytest,ypred))
              precision    recall  f1-score   support

           0       0.79      0.91      0.85       157
           1       0.81      0.61      0.69        97

    accuracy                           0.80       254
   macro avg       0.80      0.76      0.77       254
weighted avg       0.80      0.80      0.79       254

In [26]:
test_data = [[0,150,33.7,50,150,74,0.5,53]]

testData = pd.DataFrame(test_data,columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'])

result = model.predict(testData)
result[0]
Out[26]:
1
In [27]:
import joblib
joblib.dump(model,"diabetics.pkl")
Out[27]:
['diabetics.pkl']
In [ ]: